Add JSONPath for hash paths and add JSON parsing to the WebsiteAgent.

Andrew Cantino %!s(int64=11) %!d(string=hace) años
padre
commit
b876759b7b

+ 1 - 0
Gemfile

@@ -8,6 +8,7 @@ gem 'kaminari'
8 8
 gem 'bootstrap-kaminari-views'
9 9
 gem "rufus-scheduler", :require => false
10 10
 gem 'json', '>= 1.7.7'
11
+gem 'jsonpath'
11 12
 
12 13
 gem 'delayed_job', :git => 'https://github.com/wok/delayed_job' # Until the YAML issues are fixed in master.
13 14
 gem 'delayed_job_active_record', "~> 0.3.3" # newer was giving a strange MySQL error

+ 3 - 0
Gemfile.lock

@@ -115,6 +115,8 @@ GEM
115 115
       jquery-rails
116 116
       railties (>= 3.1.0)
117 117
     json (1.7.7)
118
+    jsonpath (0.5.1)
119
+      multi_json
118 120
     kaminari (0.14.1)
119 121
       actionpack (>= 3.0.0)
120 122
       activesupport (>= 3.0.0)
@@ -275,6 +277,7 @@ DEPENDENCIES
275 277
   geokit-rails3
276 278
   jquery-rails
277 279
   json (>= 1.7.7)
280
+  jsonpath
278 281
   kaminari
279 282
   kramdown
280 283
   mysql2

+ 1 - 21
app/models/agent.rb

@@ -85,27 +85,7 @@ class Agent < ActiveRecord::Base
85 85
   end
86 86
 
87 87
   def make_message(payload, message = options[:message])
88
-    message.gsub(/<([^>]+)>/) { value_at(payload, $1) || "??" }
89
-  end
90
-
91
-  def value_at(data, path)
92
-    if data.is_a?(Hash)
93
-      path.split(".").inject(data) { |memo, segment|
94
-        if memo
95
-          if memo[segment]
96
-            memo[segment]
97
-          elsif memo[segment.to_sym]
98
-            memo[segment.to_sym]
99
-          else
100
-            nil
101
-          end
102
-        else
103
-          nil
104
-        end
105
-      }.to_s
106
-    else
107
-      data
108
-    end
88
+    message.gsub(/<([^>]+)>/) { Utils.value_at(payload, $1) || "??" }
109 89
   end
110 90
 
111 91
   def set_default_schedule

+ 3 - 3
app/models/agents/peak_detector_agent.rb

@@ -7,7 +7,7 @@ module Agents
7 7
     description <<-MD
8 8
       Use a PeakDetectorAgent to watch for peaks in an event stream.  When a peak is detected, the resulting Event will have a payload message of `message`.  You can include extractions in the message, for example: `I saw a bar of: <foo.bar>`
9 9
 
10
-      The `value_path` value is a hash path to the value of interest.  `group_by_path` is a hash path that will be used to group values, if present.
10
+      The `value_path` value is a [JSONPaths](http://goessner.net/articles/JsonPath/) to the value of interest.  `group_by_path` is a hash path that will be used to group values, if present.
11 11
 
12 12
       Set `expected_receive_period_in_days` to the maximum amount of time that you'd expect to pass between Events being received by this Agent.
13 13
 
@@ -106,13 +106,13 @@ module Agents
106 106
     end
107 107
 
108 108
     def group_for(event)
109
-      ((options[:group_by_path].present? && value_at(event.payload, options[:group_by_path])) || 'no_group').to_sym
109
+      ((options[:group_by_path].present? && Utils.value_at(event.payload, options[:group_by_path])) || 'no_group').to_sym
110 110
     end
111 111
 
112 112
     def remember(group, event)
113 113
       memory[:data] ||= {}
114 114
       memory[:data][group] ||= []
115
-      memory[:data][group] << [value_at(event.payload, options[:value_path]), event.created_at.to_i]
115
+      memory[:data][group] << [Utils.value_at(event.payload, options[:value_path]), event.created_at.to_i]
116 116
       cleanup group
117 117
     end
118 118
 

+ 2 - 9
app/models/agents/trigger_agent.rb

@@ -7,14 +7,7 @@ module Agents
7 7
     description <<-MD
8 8
       Use a TriggerAgent to watch for a specific value in an Event payload.
9 9
 
10
-      The `rules` array contains hashes of `path`, `value`, and `type`.  The `path` value is a dotted path through a hash, for example `foo.bar` would return `hello` from this structure:
11
-
12
-          {
13
-            :foo => {
14
-              :bar => "hello"
15
-            },
16
-            :something => "else"
17
-          }
10
+      The `rules` array contains hashes of `path`, `value`, and `type`.  The `path` value is a dotted path through a hash in [JSONPaths](http://goessner.net/articles/JsonPath/) syntax.
18 11
 
19 12
       The `type` can be one of #{VALID_COMPARISON_TYPES.map { |t| "`#{t}`" }.to_sentence} and compares with the `value`.
20 13
 
@@ -55,7 +48,7 @@ module Agents
55 48
     def receive(incoming_events)
56 49
       incoming_events.each do |event|
57 50
         match = options[:rules].all? do |rule|
58
-          value_at_path = value_at(event[:payload], rule[:path])
51
+          value_at_path = Utils.value_at(event[:payload], rule[:path])
59 52
           case rule[:type]
60 53
             when "regex"
61 54
               value_at_path.to_s =~ Regexp.new(rule[:value], Regexp::IGNORECASE)

+ 59 - 16
app/models/agents/website_agent.rb

@@ -7,12 +7,15 @@ module Agents
7 7
     cannot_receive_events!
8 8
 
9 9
     description <<-MD
10
-      The WebsiteAgent scrapes a website and creates Events based on any changes in the results.
10
+      The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.
11 11
 
12
-      Specify the website's `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.
12
+      Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.
13 13
 
14
-      To tell the Agent how to scrape the site, specify `extract` as a hash with keys naming the extractions and values of hashes.
15
-      These subhashes specify how to extract with a `:css` CSS selector and either `:text => true` or `attr` pointing to an attribute name to grab.  An example:
14
+      The `type` value can be `xml`, `html`, or `json`.
15
+
16
+      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
17
+
18
+      When parsing HTML or XML, these sub-hashes specify how to extract with a `:css` CSS selector and either `:text => true` or `attr` pointing to an attribute name to grab.  An example:
16 19
 
17 20
           :extract => {
18 21
             :url => { :css => "#comic img", :attr => "src" },
@@ -20,12 +23,20 @@ module Agents
20 23
             :body_text => { :css => "div.main", :text => true }
21 24
           }
22 25
 
23
-      Note that whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.
26
+      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
27
+
28
+          :extract => {
29
+            :title => { :path => "results.data[*].title" },
30
+            :description => { :path => "results.data[*].description" }
31
+          }
32
+
33
+      Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor.  E.g., if you're extracting rows, all extractors must match all rows.
24 34
 
25 35
       Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
26 36
     MD
27 37
 
28
-    event_description do <<-MD
38
+    event_description do
39
+      <<-MD
29 40
       Events will have the fields you specified.  Your options look like:
30 41
 
31 42
           #{PP.pp(options[:extract], "")}
@@ -44,6 +55,7 @@ module Agents
44 55
       {
45 56
           :expected_update_period_in_days => "2",
46 57
           :url => "http://xkcd.com",
58
+          :type => "html",
47 59
           :mode => :on_change,
48 60
           :extract => {
49 61
               :url => {:css => "#comic img", :attr => "src"},
@@ -60,18 +72,22 @@ module Agents
60 72
       hydra = Typhoeus::Hydra.new
61 73
       request = Typhoeus::Request.new(options[:url], :followlocation => true)
62 74
       request.on_complete do |response|
63
-        doc = (options[:type].to_s == "xml" || options[:url] =~ /\.(rss|xml)$/i) ? Nokogiri::XML(response.body) : Nokogiri::HTML(response.body)
75
+        doc = parse(response.body)
64 76
         output = {}
65 77
         options[:extract].each do |name, extraction_details|
66
-          output[name] = doc.css(extraction_details[:css]).map { |node|
67
-            if extraction_details[:attr]
68
-              node.attr(extraction_details[:attr])
69
-            elsif extraction_details[:text]
70
-              node.text()
71
-            else
72
-              raise StandardError, ":attr or :text is required on each of the extraction patterns."
73
-            end
74
-          }
78
+          if extraction_type == "json"
79
+            output[name] = Utils.values_at(doc, extraction_details[:path])
80
+          else
81
+            output[name] = doc.css(extraction_details[:css]).map { |node|
82
+              if extraction_details[:attr]
83
+                node.attr(extraction_details[:attr])
84
+              elsif extraction_details[:text]
85
+                node.text()
86
+              else
87
+                raise StandardError, ":attr or :text is required on HTML or XML extraction patterns"
88
+              end
89
+            }
90
+          end
75 91
         end
76 92
 
77 93
         num_unique_lengths = options[:extract].keys.map { |name| output[name].length }.uniq
@@ -94,5 +110,32 @@ module Agents
94 110
       hydra.queue request
95 111
       hydra.run
96 112
     end
113
+
114
+    private
115
+
116
+    def extraction_type
117
+      (options[:type] || begin
118
+        if options[:url] =~ /\.(rss|xml)$/i
119
+          "xml"
120
+        elsif options[:url] =~ /\.json$/i
121
+          "json"
122
+        else
123
+          "html"
124
+        end
125
+      end).to_s
126
+    end
127
+
128
+    def parse(data)
129
+      case extraction_type
130
+        when "xml"
131
+          Nokogiri::XML(data)
132
+        when "json"
133
+          JSON.parse(data)
134
+        when "html"
135
+          Nokogiri::HTML(data)
136
+        else
137
+          raise "Unknown extraction type #{extraction_type}"
138
+      end
139
+    end
97 140
   end
98 141
 end

+ 1 - 1
config/initializers/multi_xml_patch.rb

@@ -15,7 +15,7 @@ module MultiXml
15 15
     end
16 16
   end
17 17
 
18
-  DISALLOWED_XML_TYPES = %w(symbol yaml)
18
+  DISALLOWED_XML_TYPES = %w(symbol yaml) unless defined?(DISALLOWED_XML_TYPES)
19 19
 
20 20
   class << self
21 21
     def parse(xml, options={})

+ 10 - 0
lib/utils.rb

@@ -1,3 +1,5 @@
1
+require 'jsonpath'
2
+
1 3
 module Utils
2 4
   # Unindents if the indentation is 2 or more characters.
3 5
   def self.unindent(s)
@@ -14,4 +16,12 @@ module Utils
14 16
         object
15 17
     end
16 18
   end
19
+
20
+  def self.value_at(data, path)
21
+    values_at(data, path).first
22
+  end
23
+
24
+  def self.values_at(data, path)
25
+    JsonPath.new(path).on(data.is_a?(String) ? data : data.to_json)
26
+  end
17 27
 end

+ 22 - 0
spec/lib/utils_spec.rb

@@ -0,0 +1,22 @@
1
+require 'spec_helper'
2
+
3
+describe Utils do
4
+  describe "#value_at" do
5
+    it "returns the value at a JSON path" do
6
+      Utils.value_at({ :foo => { :bar => :baz }}.to_json, "foo.bar").should == "baz"
7
+      Utils.value_at({ :foo => { :bar => { :bing => 2 } }}, "foo.bar.bing").should == 2
8
+    end
9
+
10
+    it "returns nil when the path cannot be followed" do
11
+      Utils.value_at({ :foo => { :bar => :baz }}, "foo.bing").should be_nil
12
+    end
13
+  end
14
+
15
+  describe "#values_at" do
16
+    it "returns arrays of matching values" do
17
+      Utils.values_at({ :foo => { :bar => :baz }}, "foo.bar").should == %w[baz]
18
+      Utils.values_at({ :foo => [ { :bar => :baz }, { :bar => :bing } ]}, "foo[*].bar").should == %w[baz bing]
19
+      Utils.values_at({ :foo => [ { :bar => :baz }, { :bar => :bing } ]}, "foo[*].bar").should == %w[baz bing]
20
+    end
21
+  end
22
+end

+ 4 - 4
spec/models/agents/peak_detector_agent_spec.rb

@@ -22,9 +22,9 @@ describe Agents::PeakDetectorAgent do
22 22
       events = build_events(:keys => [:count, :filter],
23 23
                             :values => [[1, "something"], [2, "something"], [3, "else"]])
24 24
       @agent.receive events
25
-      @agent.memory[:data][:something].map(&:first).should == %w[1 2]
25
+      @agent.memory[:data][:something].map(&:first).should == [1, 2]
26 26
       @agent.memory[:data][:something].last.last.should be_within(10).of((100 - 1).hours.ago.to_i)
27
-      @agent.memory[:data][:else].first.first.should == "3"
27
+      @agent.memory[:data][:else].first.first.should == 3
28 28
       @agent.memory[:data][:else].first.last.should be_within(10).of((100 - 2).hours.ago.to_i)
29 29
     end
30 30
 
@@ -32,7 +32,7 @@ describe Agents::PeakDetectorAgent do
32 32
       @agent.options[:group_by_path] = ""
33 33
       events = build_events(:keys => [:count], :values => [[1], [2]])
34 34
       @agent.receive events
35
-      @agent.memory[:data][:no_group].map(&:first).should == %w[1 2]
35
+      @agent.memory[:data][:no_group].map(&:first).should == [1, 2]
36 36
     end
37 37
 
38 38
     it "keeps a rolling window of data" do
@@ -40,7 +40,7 @@ describe Agents::PeakDetectorAgent do
40 40
       @agent.receive build_events(:keys => [:count],
41 41
                                   :values => [1, 2, 3, 4, 5, 6, 7, 8].map {|i| [i]},
42 42
                                   :pattern => { :filter => "something" })
43
-      @agent.memory[:data][:something].map(&:first).should == %w[4 5 6 7 8]
43
+      @agent.memory[:data][:something].map(&:first).should == [4, 5, 6, 7, 8]
44 44
     end
45 45
 
46 46
     it "finds peaks" do

+ 0 - 7
spec/models/agents/trigger_agent_spec.rb

@@ -120,13 +120,6 @@ describe Agents::TriggerAgent do
120 120
         @checker.receive([@event])
121 121
       }.should_not change { Event.count }
122 122
 
123
-
124
-      @event.payload = "world"
125
-      @checker.options[:rules].first[:path] = "anything"
126
-      lambda {
127
-        @checker.receive([@event])
128
-      }.should change { Event.count }.by(1)
129
-
130 123
       @checker.options[:rules].first[:value] = "hi"
131 124
       lambda {
132 125
         @checker.receive([@event])

+ 79 - 0
spec/models/agents/website_agent_spec.rb

@@ -6,6 +6,7 @@ describe Agents::WebsiteAgent do
6 6
     @site = {
7 7
         :name => "XKCD",
8 8
         :expected_update_period_in_days => 2,
9
+        :type => "html",
9 10
         :url => "http://xkcd.com",
10 11
         :mode => :on_change,
11 12
         :extract => {
@@ -41,4 +42,82 @@ describe Agents::WebsiteAgent do
41 42
       }.should raise_error(StandardError, /Got an uneven number of matches/)
42 43
     end
43 44
   end
45
+
46
+  describe "parsing" do
47
+    it "parses CSS" do
48
+      @checker.check
49
+      event = Event.last
50
+      event.payload[:url].should == "http://imgs.xkcd.com/comics/evolving.png"
51
+      event.payload[:title].should =~ /^Biologists play reverse/
52
+    end
53
+
54
+    describe "JSON" do
55
+      it "works with paths" do
56
+        json = {
57
+            :response => {
58
+                :version => 2,
59
+                :title => "hello!"
60
+            }
61
+        }
62
+        stub_request(:any, /json-site/).to_return(:body => json.to_json, :status => 200)
63
+        site = {
64
+            :name => "Some JSON Response",
65
+            :expected_update_period_in_days => 2,
66
+            :type => "json",
67
+            :url => "http://json-site.com",
68
+            :mode => :on_change,
69
+            :extract => {
70
+                :version => { :path => "response.version" },
71
+                :title => { :path => "response.title" }
72
+            }
73
+        }
74
+        checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
75
+        checker.user = users(:bob)
76
+        checker.save!
77
+
78
+        checker.check
79
+        event = Event.last
80
+        event.payload[:version].should == 2
81
+        event.payload[:title].should == "hello!"
82
+      end
83
+
84
+      it "can handle arrays" do
85
+        json = {
86
+            :response => {
87
+                :data => [
88
+                    { :title => "first", :version => 2 },
89
+                    { :title => "second", :version => 2.5 }
90
+                ]
91
+            }
92
+        }
93
+        stub_request(:any, /json-site/).to_return(:body => json.to_json, :status => 200)
94
+        site = {
95
+            :name => "Some JSON Response",
96
+            :expected_update_period_in_days => 2,
97
+            :type => "json",
98
+            :url => "http://json-site.com",
99
+            :mode => :on_change,
100
+            :extract => {
101
+                :title => { :path => "response.data[*].title" },
102
+                :version => { :path => "response.data[*].version" }
103
+            }
104
+        }
105
+        checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
106
+        checker.user = users(:bob)
107
+        checker.save!
108
+
109
+        lambda {
110
+          checker.check
111
+        }.should change { Event.count }.by(2)
112
+
113
+        event = Event.all[-1]
114
+        event.payload[:version].should == 2.5
115
+        event.payload[:title].should == "second"
116
+
117
+        event = Event.all[-2]
118
+        event.payload[:version].should == 2
119
+        event.payload[:title].should == "first"
120
+      end
121
+    end
122
+  end
44 123
 end